# Download the text of the fourteen Project Gutenberg books analysed below.
books <- gutenberg_download(
  c(580, 730, 967, 700, 917, 968, 821, 766, 1023, 786, 963, 1400, 883, 564)
)
## Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
Q1
# Q1: the 20 most frequent words over all books once stop words are removed.
# "miss" and "sir" are appended to the stop-word table, each paired with the
# tag "SMART".
stop_words <- rbind(stop_words, c("miss", "SMART"), c("sir", "SMART"))

# One row per word occurrence; reused by later sections.
words <- unnest_tokens(books, word, text)

word_counts <- words %>%
  anti_join(stop_words, by = "word") %>%
  count(word, sort = TRUE) %>%
  top_n(20)
## Selecting by n
hchart(word_counts, "column", hcaes(x = word, y = n))
Q2
# Q2: word cloud of the 200 most frequent non-stop words, drawn with the
# image at figPath passed to wordcloud2.
first200 <- words %>%
  anti_join(stop_words, by = "word") %>%
  count(word, sort = TRUE) %>%
  top_n(200)
## Selecting by n
wordcloud2(first200, size = 0.2, figPath = "~/Desktop/fig1.png")
Q3
# Q3: for every book, plot the five most frequent words that occur
# capitalised but never in lower case (a rough proxy for proper names).
numbers <- c(580, 730, 967, 700, 917, 968, 821, 766, 1023, 786, 963, 1400, 883, 564)

for (i in numbers) {
  # Work on the text column of the already downloaded `books` table. Passing
  # a whole data frame to str_replace_all() would process a deparsed
  # c("...", ...) string rather than the book's text, and downloading each
  # book a second time is unnecessary.
  book_text <- paste(books$text[books$gutenberg_id == i], collapse = " ")
  clean <- str_replace_all(book_text, "[[:punct:]]", "")

  # Words of two or more characters, split by the case of their first letter.
  firstcap <- data.frame(
    word = tolower(str_extract_all(clean, "\\b[A-Z]\\w+")[[1]]),
    stringsAsFactors = FALSE
  )
  firstnotcap <- data.frame(
    word = unique(str_extract_all(clean, "\\b[a-z]\\w+")[[1]]),
    stringsAsFactors = FALSE
  )

  # Drop anything that also occurs in lower case, then drop stop words.
  filtered <- firstcap %>%
    anti_join(firstnotcap, by = "word") %>%
    anti_join(stop_words, by = "word") %>%
    count(word, sort = TRUE) %>%
    top_n(5, n)

  # ggtitle() expects a string, not a one-column table.
  book_title <- gutenberg_metadata$title[gutenberg_metadata$gutenberg_id == i]

  print(
    ggplot(filtered, aes(x = word, y = n)) +
      geom_bar(stat = "identity") +
      ggtitle(book_title)
  )
}














Q4
# Q4: for every book, plot the 40 most frequent words that the NRC lexicon
# tags as positive or negative.
# One row per word is kept so that a word carrying both tags (if any) is not
# counted twice by the join below.
positive <- get_sentiments("nrc") %>%
  filter(sentiment %in% c("positive", "negative")) %>%
  distinct(word)

np <- books %>%
  group_by(gutenberg_id) %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words, by = "word") %>%
  inner_join(positive, by = "word") %>%
  count(word, sort = TRUE) %>%
  top_n(40) %>%
  do(p = {
    # Look the author up with plain indexing. Inside a nested
    # `gutenberg_metadata %>% filter(... .$gutenberg_id ...)` the dot is the
    # metadata table itself, not the current group, so that filter would
    # keep every row.
    book_id <- .$gutenberg_id[1]
    book_author <- unique(
      gutenberg_metadata$author[gutenberg_metadata$gutenberg_id == book_id]
    )
    ggplot(data = .) +
      geom_bar(aes(x = reorder(word, -n), y = n), stat = "identity") +
      ggtitle(book_author) +
      theme(axis.text.x = element_text(angle = 90, hjust = 1))
  })
## Selecting by n
np$p
## [[1]]

##
## [[2]]

##
## [[3]]

##
## [[4]]

##
## [[5]]

##
## [[6]]

##
## [[7]]

##
## [[8]]

##
## [[9]]

##
## [[10]]

##
## [[11]]

##
## [[12]]

##
## [[13]]

##
## [[14]]

Q5
# Q5: split Les Miserables (Gutenberg id 135) into 200 consecutive sections
# and chart the number of positive and of negative NRC words in each one.
les_miserablebook <- gutenberg_download(135)
n_sections <- 200

# Map line k of n to section ceiling(k * n_sections / n) so that each section
# is a contiguous run of lines. rep(1:200, times) would instead deal the
# lines out round-robin, mixing the whole book into every section.
section_id <- ceiling(
  seq_len(nrow(les_miserablebook)) * n_sections / nrow(les_miserablebook)
)
les_miserable <- split(les_miserablebook, section_id)

positive <- get_sentiments("nrc") %>% filter(sentiment == "positive")
negative <- get_sentiments("nrc") %>% filter(sentiment == "negative")

# Number of non-stop-word tokens in `section` that appear in `lexicon`.
count_sentiment_words <- function(section, lexicon) {
  section %>%
    unnest_tokens(word, text) %>%
    anti_join(stop_words, by = "word") %>%
    inner_join(lexicon, by = "word") %>%
    nrow()
}

# Build each result in one step with fixed column names, instead of growing
# a data frame with rbind() and ending up with a value-derived column name.
positives <- data.frame(
  section = seq_along(les_miserable),
  n = vapply(
    les_miserable, count_sentiment_words, integer(1),
    lexicon = positive, USE.NAMES = FALSE
  )
)
negatives <- data.frame(
  section = seq_along(les_miserable),
  n = vapply(
    les_miserable, count_sentiment_words, integer(1),
    lexicon = negative, USE.NAMES = FALSE
  )
)

hchart(positives, "column", hcaes(x = section, y = n))
hchart(negatives, "column", hcaes(x = section, y = n))
Q6
# Q6: the 30 most frequent bigrams in which neither word is a stop word.
bigrams <- books %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  # Lines with fewer than two words can produce an NA bigram; without this
  # filter it passes the stop-word test and is counted as "NA NA".
  filter(!is.na(bigram)) %>%
  separate(bigram, c("firstToken", "secondToken"), sep = " ") %>%
  filter(!firstToken %in% stop_words$word) %>%
  filter(!secondToken %in% stop_words$word)

bigramcnt <- bigrams %>%
  count(firstToken, secondToken, sort = TRUE) %>%
  top_n(30)
## Selecting by n
hchart(
  bigramcnt, "column",
  hcaes(x = paste(firstToken, secondToken, sep = " "), y = n)
)
Q7
# Q7: the 30 most frequent bigrams that start with "he" or "she" and whose
# second word is not a stop word.
bigramcntt <- books %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  separate(bigram, c("firstToken", "secondToken"), sep = " ") %>%
  filter(
    firstToken %in% c("he", "she"),
    !secondToken %in% stop_words$word
  ) %>%
  count(firstToken, secondToken, sort = TRUE) %>%
  top_n(30)
## Selecting by n
hchart(
  bigramcntt, "column",
  hcaes(x = paste(firstToken, secondToken, sep = " "), y = n)
)
Q8
# Q8: for every book, regress log(occurrence) on log(rank) for single words
# and for bigrams, print both coefficient pairs and plot each fit.
#
# `index` is the rank of a token within its own book. Numbering the rows of
# the combined table would give ranks over all books at once.
# NOTE: the coefficient tables recorded below this section were produced with
# the combined numbering and need to be regenerated.
wordspergroup <- words %>%
  anti_join(stop_words, by = "word") %>%
  count(gutenberg_id, word, sort = TRUE) %>%
  group_by(gutenberg_id) %>%
  mutate(index = row_number()) %>%
  ungroup() %>%
  select(gutenberg_id, token = word, occurrence = n, index)

bigramspergroup <- books %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  # Lines with fewer than two words can produce an NA bigram.
  filter(!is.na(bigram)) %>%
  separate(bigram, c("firstToken", "secondToken"), sep = " ") %>%
  filter(!firstToken %in% stop_words$word) %>%
  filter(!secondToken %in% stop_words$word) %>%
  count(gutenberg_id, firstToken, secondToken, sort = TRUE) %>%
  group_by(gutenberg_id) %>%
  mutate(
    token = paste(firstToken, secondToken, sep = " "),
    index = row_number()
  ) %>%
  ungroup() %>%
  select(gutenberg_id, occurrence = n, token, index)

for (i in numbers) {
  unigram <- wordspergroup %>% filter(gutenberg_id == i)
  bigram <- bigramspergroup %>% filter(gutenberg_id == i)

  # `data =` passes the table as an argument; `data <- ...` would also
  # create a global variable named `data`.
  u <- glm(log(occurrence) ~ log(index), data = unigram, family = "gaussian")
  b <- glm(log(occurrence) ~ log(index), data = bigram, family = "gaussian")

  # Column 1: word fit, column 2: bigram fit. Row 1: intercept, row 2: slope.
  fit <- cbind(coef(u), coef(b))
  print(fit)

  book_title <- gutenberg_metadata$title[gutenberg_metadata$gutenberg_id == i]

  print(
    ggplot(unigram, aes(x = log(index), y = log(occurrence))) +
      geom_point() +
      geom_abline(slope = fit[2, 1], intercept = fit[1, 1]) +
      labs(
        x = "Log(n-gram index)",
        y = "Log(Number of occurrence)",
        title = book_title
      )
  )
  # The bigram plot draws the bigram fit (column 2), not the word fit.
  print(
    ggplot(bigram, aes(x = log(index), y = log(occurrence))) +
      geom_point() +
      geom_abline(slope = fit[2, 2], intercept = fit[1, 2]) +
      labs(
        x = "Log(n-gram index)",
        y = "Log(Number of occurrence)",
        title = book_title
      )
  )
}
## [,1] [,2]
## (Intercept) 13.700295 5.2547912
## log(index) -1.161758 -0.4956357


## [,1] [,2]
## (Intercept) 14.022850 4.7141446
## log(index) -1.185473 -0.4219228


## [,1] [,2]
## (Intercept) 12.767245 4.3412568
## log(index) -1.055001 -0.3573615


## [,1] [,2]
## (Intercept) 13.970550 4.8868124
## log(index) -1.183173 -0.4451058


## [,1] [,2]
## (Intercept) 13.177782 4.2700786
## log(index) -1.096359 -0.3579688


## [,1] [,2]
## (Intercept) 12.746908 4.241447
## log(index) -1.051379 -0.346086


## [,1] [,2]
## (Intercept) 12.988551 4.4892791
## log(index) -1.084554 -0.3862397


## [,1] [,2]
## (Intercept) 13.238882 4.634127
## log(index) -1.112116 -0.408223


## [,1] [,2]
## (Intercept) 12.610955 4.2030785
## log(index) -1.037098 -0.3404808


## [,1] [,2]
## (Intercept) 14.291325 4.496516
## log(index) -1.204845 -0.391369


## [,1] [,2]
## (Intercept) 12.89749 4.2814483
## log(index) -1.06923 -0.3556469


## [,1] [,2]
## (Intercept) 13.012471 4.1134573
## log(index) -1.070863 -0.3314952


## [,1] [,2]
## (Intercept) 13.074584 4.4391030
## log(index) -1.089667 -0.3763249


## [,1] [,2]
## (Intercept) 15.628977 6.0056962
## log(index) -1.340191 -0.5911522


Q9
# Q9: the same log-log rank/occurrence fit as in Q8, applied to the single
# book held in `les_miserablebook`.
wordspergroup <- les_miserablebook %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words, by = "word") %>%
  count(word, sort = TRUE) %>%
  select(token = word, occurrence = n)

bigramspergroup <- les_miserablebook %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  # Lines with fewer than two words can produce an NA bigram.
  filter(!is.na(bigram)) %>%
  separate(bigram, c("firstToken", "secondToken"), sep = " ") %>%
  filter(!firstToken %in% stop_words$word) %>%
  filter(!secondToken %in% stop_words$word) %>%
  count(firstToken, secondToken, sort = TRUE) %>%
  mutate(bigram = paste(firstToken, secondToken, sep = " ")) %>%
  select(token = bigram, occurrence = n)

# Both tables are sorted by descending count, so the row number is the rank.
wordspergroup$index <- seq_len(nrow(wordspergroup))
bigramspergroup$index <- seq_len(nrow(bigramspergroup))

# `data =` passes the table as an argument; `data <- ...` would also create
# a global variable named `data`.
u <- glm(log(occurrence) ~ log(index), data = wordspergroup, family = "gaussian")
b <- glm(log(occurrence) ~ log(index), data = bigramspergroup, family = "gaussian")

# Column 1: word fit, column 2: bigram fit. Row 1: intercept, row 2: slope.
fit <- cbind(coef(u), coef(b))
print(fit)
## [,1] [,2]
## (Intercept) 12.052251 2.8524936
## log(index) -1.210877 -0.2824447

# Title the plots with this book's own title. The loop variable `i` left over
# from the previous section holds the id of a different book.
book_id <- les_miserablebook$gutenberg_id[1]
book_title <- gutenberg_metadata$title[gutenberg_metadata$gutenberg_id == book_id]

print(
  ggplot(wordspergroup, aes(x = log(index), y = log(occurrence))) +
    geom_point() +
    geom_abline(slope = fit[2, 1], intercept = fit[1, 1]) +
    labs(
      x = "Log(n-gram index)",
      y = "Log(Number of occurrence)",
      title = paste(
        book_title,
        "Double logarithmic plot of the words occurrence distribution",
        sep = " "
      )
    )
)

# The bigram plot draws the bigram fit (column 2), not the word fit.
print(
  ggplot(bigramspergroup, aes(x = log(index), y = log(occurrence))) +
    geom_point() +
    geom_abline(slope = fit[2, 2], intercept = fit[1, 2]) +
    labs(
      x = "Log(n-gram index)",
      y = "Log(Number of occurrence)",
      title = paste(
        book_title,
        "Double logarithmic plot of the bigrams occurrence distribution",
        sep = " "
      )
    )
)
